## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
##
## Loading required package: plyr
## Loading required package: stringr
Part of the idea is that for the Laplace (the standard) or exponential (what we are using) noising to work we have to plug in a sigma (level of noising). We simulate having a very good methodology to do so by supplying dCal a large calibration set to pick sigma. In practice you don’t have such a set and would need to either know sigma from first principles or experience, or use some of your training data to build it. What we want to demonstrate is the effectiveness of the differential privacy inspired noising techniques, so we will give it a good sigma (which one may or may not have in actual practice).
print('naive effects model')
## [1] "naive effects model"
bCoder <- trainEffectCoderR(dTrain,yName,vars,0)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.36368 -0.38461 0.00547 0.37974 1.76628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.08056 0.01285 6.270 4.44e-10 ***
## x1 0.26619 0.02110 12.617 < 2e-16 ***
## x2 0.25160 0.03405 7.389 2.18e-13 ***
## x3 0.19060 0.05277 3.612 0.000312 ***
## x4 0.27750 0.02483 11.178 < 2e-16 ***
## x5 0.27266 0.02451 11.122 < 2e-16 ***
## x6 0.21281 0.03761 5.658 1.75e-08 ***
## x7 0.25934 0.02207 11.752 < 2e-16 ***
## x8 0.28974 0.02194 13.208 < 2e-16 ***
## x9 0.24599 0.02020 12.175 < 2e-16 ***
## x10 0.26001 0.01797 14.465 < 2e-16 ***
## n1 0.09067 0.01400 6.475 1.19e-10 ***
## n2 0.09275 0.01416 6.552 7.26e-11 ***
## n3 0.11523 0.01392 8.277 2.31e-16 ***
## n4 0.08853 0.01403 6.308 3.48e-10 ***
## n5 0.07794 0.01441 5.410 7.09e-08 ***
## n6 0.09353 0.01446 6.468 1.25e-10 ***
## n7 0.08617 0.01443 5.974 2.74e-09 ***
## n8 0.10136 0.01409 7.196 8.80e-13 ***
## n9 0.09799 0.01410 6.949 5.00e-12 ***
## n10 0.10318 0.01430 7.214 7.70e-13 ***
## n11 0.11272 0.01402 8.041 1.53e-15 ***
## n12 0.08253 0.01430 5.773 9.04e-09 ***
## n13 0.10629 0.01451 7.324 3.51e-13 ***
## n14 0.10574 0.01409 7.502 9.46e-14 ***
## n15 0.11166 0.01427 7.827 8.13e-15 ***
## n16 0.08986 0.01379 6.518 9.01e-11 ***
## n17 0.08851 0.01406 6.295 3.78e-10 ***
## n18 0.09905 0.01404 7.056 2.36e-12 ***
## n19 0.09338 0.01476 6.328 3.07e-10 ***
## n20 0.11530 0.01387 8.311 < 2e-16 ***
## n21 0.09641 0.01424 6.769 1.71e-11 ***
## n22 0.10090 0.01397 7.225 7.15e-13 ***
## n23 0.08629 0.01374 6.282 4.10e-10 ***
## n24 0.10409 0.01408 7.392 2.12e-13 ***
## n25 0.08755 0.01444 6.063 1.59e-09 ***
## n26 0.09146 0.01401 6.528 8.46e-11 ***
## n27 0.10086 0.01433 7.038 2.69e-12 ***
## n28 0.10048 0.01373 7.316 3.72e-13 ***
## n29 0.09177 0.01455 6.305 3.55e-10 ***
## n30 0.08877 0.01417 6.265 4.57e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5746 on 1959 degrees of freedom
## Multiple R-squared: 0.927, Adjusted R-squared: 0.9255
## F-statistic: 622.2 on 40 and 1959 DF, p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 0.568682746837468"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
'naive effects model train',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## Warning: Removed 5 rows containing missing values (geom_smooth).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.140]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.80270755169949"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
'naive effects model test',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.293]
# try re-fitting pred to see if it is just a shift/scale issue
# it is unlikely to be such, but good to look
f2 <- paste(yName,'pred',sep=' ~ ')
m2 <- lm(f2,data=dTestB) # dileberately fitting on test itself (to show it does not help)
print(summary(m2))
##
## Call:
## lm(formula = f2, data = dTestB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6308 -1.2125 -0.0242 1.1666 6.6364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08973 0.01789 -5.016 5.37e-07 ***
## pred 1.35606 0.02195 61.781 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.779 on 9998 degrees of freedom
## Multiple R-squared: 0.2763, Adjusted R-squared: 0.2762
## F-statistic: 3817 on 1 and 9998 DF, p-value: < 2.2e-16
dTestB$pred2 <- predict(m2,newdata=dTestB)
print(paste('adjusted test rmse',rmse(dTestB$pred2,dTestB[[yName]])))
## [1] "adjusted test rmse 1.77849696252789"
print(WVPlots::ScatterHist(dTestB,'pred2',yName,
'naive effects model test (adjusted on test)',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.446]
print(paste('effects model, sigma=',bSigmaBest))
## [1] "effects model, sigma= 15"
bCoder <- trainEffectCoderR(dTrain,yName,vars,bSigmaBest)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3712 -0.7363 -0.0120 0.7024 3.7097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0805556 0.0237408 3.393 0.000705 ***
## x1 0.8521069 0.0364155 23.400 < 2e-16 ***
## x2 0.8218086 0.0598171 13.739 < 2e-16 ***
## x3 0.6195162 0.0786974 7.872 5.72e-15 ***
## x4 0.9505478 0.0453301 20.969 < 2e-16 ***
## x5 0.9201665 0.0430214 21.389 < 2e-16 ***
## x6 0.7456058 0.0670167 11.126 < 2e-16 ***
## x7 0.8907750 0.0370966 24.012 < 2e-16 ***
## x8 0.9432426 0.0353649 26.672 < 2e-16 ***
## x9 0.8120753 0.0309308 26.255 < 2e-16 ***
## x10 0.7901604 0.0265552 29.755 < 2e-16 ***
## n1 0.0038486 0.0039417 0.976 0.329001
## n2 0.0075950 0.0036580 2.076 0.037999 *
## n3 0.0014070 0.0040803 0.345 0.730267
## n4 0.0077329 0.0044794 1.726 0.084449 .
## n5 -0.0003303 0.0037112 -0.089 0.929086
## n6 0.0058793 0.0038900 1.511 0.130851
## n7 0.0093407 0.0036643 2.549 0.010877 *
## n8 0.0099106 0.0040796 2.429 0.015217 *
## n9 0.0091875 0.0039583 2.321 0.020384 *
## n10 0.0022582 0.0038785 0.582 0.560471
## n11 0.0100323 0.0039660 2.530 0.011499 *
## n12 0.0130399 0.0038630 3.376 0.000751 ***
## n13 0.0061204 0.0042401 1.443 0.149052
## n14 0.0067147 0.0040914 1.641 0.100919
## n15 0.0004988 0.0037050 0.135 0.892922
## n16 0.0095963 0.0043916 2.185 0.028998 *
## n17 0.0004090 0.0038043 0.108 0.914403
## n18 0.0081236 0.0035346 2.298 0.021649 *
## n19 0.0049638 0.0040072 1.239 0.215603
## n20 0.0084900 0.0043625 1.946 0.051782 .
## n21 0.0056261 0.0040532 1.388 0.165269
## n22 0.0052225 0.0039597 1.319 0.187346
## n23 0.0159540 0.0038519 4.142 3.59e-05 ***
## n24 0.0064288 0.0036831 1.745 0.081060 .
## n25 0.0090151 0.0040357 2.234 0.025606 *
## n26 0.0050930 0.0033655 1.513 0.130366
## n27 0.0068078 0.0042229 1.612 0.107094
## n28 0.0100391 0.0035710 2.811 0.004984 **
## n29 0.0067839 0.0042857 1.583 0.113604
## n30 0.0065435 0.0042774 1.530 0.126228
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.062 on 1959 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7458
## F-statistic: 147.6 on 40 and 1959 DF, p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 1.05078249365702"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
paste('effects model train, sigma=',bSigmaBest),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.635]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.15434085305498"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
paste('effects model test, sigma=',bSigmaBest),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.788]
print('effects model, jacknifed')
## [1] "effects model, jacknifed"
bCoder <- trainEffectCoderR(dTrain,yName,vars,0)
# dTrainB <- bCoder$codeFrame(dTrain)
# dTrainB <- bCoder$codeFrame(dCal)
dTrainB <- jackknifeEffectCodeR(dTrain,yName,vars)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0503 -0.7460 0.0110 0.6973 3.9378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.080580 0.024499 3.289 0.00102 **
## x1 0.878376 0.036480 24.078 < 2e-16 ***
## x2 0.891187 0.062232 14.320 < 2e-16 ***
## x3 0.769654 0.098820 7.788 1.09e-14 ***
## x4 0.944314 0.043679 21.619 < 2e-16 ***
## x5 0.945152 0.042897 22.033 < 2e-16 ***
## x6 0.849552 0.069355 12.249 < 2e-16 ***
## x7 0.949746 0.037648 25.227 < 2e-16 ***
## x8 1.043654 0.036410 28.664 < 2e-16 ***
## x9 0.962349 0.033361 28.847 < 2e-16 ***
## x10 0.925969 0.029086 31.836 < 2e-16 ***
## n1 0.027136 0.020443 1.327 0.18454
## n2 -0.009259 0.020976 -0.441 0.65896
## n3 0.031421 0.020508 1.532 0.12565
## n4 -0.030794 0.020589 -1.496 0.13491
## n5 -0.020427 0.020843 -0.980 0.32720
## n6 0.009093 0.021514 0.423 0.67261
## n7 0.013300 0.021150 0.629 0.52953
## n8 0.008724 0.020401 0.428 0.66897
## n9 -0.001165 0.020370 -0.057 0.95440
## n10 -0.001888 0.021453 -0.088 0.92990
## n11 0.012744 0.020469 0.623 0.53364
## n12 -0.001472 0.020607 -0.071 0.94306
## n13 -0.034807 0.020765 -1.676 0.09385 .
## n14 0.019618 0.020409 0.961 0.33655
## n15 0.027026 0.020765 1.302 0.19323
## n16 -0.002004 0.019604 -0.102 0.91859
## n17 0.003144 0.020437 0.154 0.87777
## n18 0.008286 0.020588 0.402 0.68738
## n19 -0.040983 0.021189 -1.934 0.05324 .
## n20 -0.012028 0.020187 -0.596 0.55136
## n21 -0.024144 0.021062 -1.146 0.25179
## n22 0.016472 0.020233 0.814 0.41567
## n23 -0.012645 0.020121 -0.628 0.52976
## n24 0.020160 0.021181 0.952 0.34132
## n25 -0.049179 0.020551 -2.393 0.01680 *
## n26 0.021604 0.020720 1.043 0.29723
## n27 -0.022748 0.021124 -1.077 0.28166
## n28 -0.008041 0.020318 -0.396 0.69231
## n29 -0.036017 0.021344 -1.687 0.09167 .
## n30 0.009637 0.020581 0.468 0.63968
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.095 on 1959 degrees of freedom
## Multiple R-squared: 0.7349, Adjusted R-squared: 0.7295
## F-statistic: 135.8 on 40 and 1959 DF, p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 1.08397286675349"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
'effects model train, jackknifed',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.965]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.0755272623194"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
'effects model test, jackknifed',
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1118]
print("vtreat split model")
## [1] "vtreat split model"
pruneSig = 0.05
print("working vtreat split model")
## [1] "working vtreat split model"
mTitle <- 'vtreat split model'
isTrain <- runif(nrow(dTrain))<=0.5
dTrainDT <- dTrain[isTrain,]
dTrainDC <- dTrain[!isTrain,]
treatments <- vtreat::designTreatmentsN(dTrainDC,vars,yName,
rareSig=0.3,
smFactor=5.0,
minFraction=2.0,
verbose=FALSE,
parallelCluster=cl)
dTrainV <- vtreat::prepare(treatments,dTrainDT,pruneSig=pruneSig,
parallelCluster=cl)
#print(treatments$scoreFrame)
varsV <- intersect(colnames(dTrainV),
treatments$scoreFrame$varName[treatments$scoreFrame$sig<pruneSig])
print(varsV)
## [1] "x1_catN" "x2_catN" "x4_catN" "x5_catN" "x6_catN" "x7_catN"
## [7] "x7_catD" "x8_catP" "x8_catN" "x9_catN" "x9_catD" "x10_catP"
## [13] "x10_catN" "x10_catD" "n3_catD" "n12_catN"
dTestV <- vtreat::prepare(treatments,dTest,pruneSig=pruneSig,
varRestriction=varsV,
parallelCluster=cl)
formulaV <- paste(yName,paste(varsV,collapse=' + '),sep=' ~ ')
modelV <- lm(formulaV,data=dTrainV)
dTrainV$pred <- predict(modelV,newdata=dTrainV)
print(paste('train rmse',rmse(dTrainV$pred,dTrainV[[yName]])))
## [1] "train rmse 1.15109777130287"
print(WVPlots::ScatterHist(dTrainV,'pred',yName,
paste(mTitle,'train'),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1295]
dTestV$pred <- predict(modelV,newdata=dTestV)
print(paste('test rmse',rmse(dTestV$pred,dTestV[[yName]])))
## [1] "test rmse 1.14434620432544"
print(WVPlots::ScatterHist(dTestV,'pred',yName,
paste(mTitle,'test'),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1448]
print("vtreat cross model")
## [1] "vtreat cross model"
pruneSig = 0.05
if("mkCrossFrameNExperiment" %in% ls('package:vtreat')) {
print("working vtreat cross model")
mTitle <- 'vtreat cross model'
crossD <- vtreat::mkCrossFrameNExperiment(dTrain,vars,yName,
rareSig=0.3,
smFactor=5.0,
minFraction=2.0,
parallelCluster=cl)
treatments <- crossD$treatments
dTrainV <- crossD$crossFrame
# print(treatments$scoreFrame)
varsV <- intersect(colnames(dTrainV),
treatments$scoreFrame$varName[treatments$scoreFrame$sig<pruneSig])
print(varsV)
dTestV <- vtreat::prepare(treatments,dTest,pruneSig=pruneSig,
varRestriction=varsV,
parallelCluster=cl)
formulaV <- paste(yName,paste(varsV,collapse=' + '),sep=' ~ ')
modelV <- lm(formulaV,data=dTrainV)
print(summary(modelV))
dTrainV$pred <- predict(modelV,newdata=dTrainV)
print(paste('train rmse',rmse(dTrainV$pred,dTrainV[[yName]])))
print(WVPlots::ScatterHist(dTrainV,'pred',yName,
paste(mTitle,'train'),
smoothmethod='lm',annot_size=2))
dTestV$pred <- predict(modelV,newdata=dTestV)
print(paste('test rmse',rmse(dTestV$pred,dTestV[[yName]])))
print(WVPlots::ScatterHist(dTestV,'pred',yName,
paste(mTitle,'test'),
smoothmethod='lm',annot_size=2))
} else {
print("cross model not in library")
}
## [1] "working vtreat cross model"
## [1] "x1_catN" "x2_catN" "x3_catN" "x4_catP" "x4_catN" "x5_catN"
## [7] "x6_catN" "x7_catN" "x7_catD" "x8_catN" "x8_catD" "x9_catN"
## [13] "x10_catP" "x10_catN" "n19_catN" "n23_catN"
##
## Call:
## lm(formula = formulaV, data = dTrainV)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3691 -0.7844 -0.0079 0.7689 4.4618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.373728 0.775279 -0.482 0.630
## x1_catN 0.875961 0.038728 22.618 < 2e-16 ***
## x2_catN 0.870092 0.067330 12.923 < 2e-16 ***
## x3_catN 0.753288 0.105224 7.159 1.14e-12 ***
## x4_catP -2.199028 3.061333 -0.718 0.473
## x4_catN 0.938579 0.047969 19.567 < 2e-16 ***
## x5_catN 0.958568 0.046062 20.811 < 2e-16 ***
## x6_catN 0.827816 0.072774 11.375 < 2e-16 ***
## x7_catN 0.934857 0.041977 22.271 < 2e-16 ***
## x7_catD 0.806048 0.200909 4.012 6.24e-05 ***
## x8_catN 1.077636 0.039708 27.139 < 2e-16 ***
## x8_catD -0.339598 0.195158 -1.740 0.082 .
## x9_catN 0.978051 0.035411 27.620 < 2e-16 ***
## x10_catP -3.007540 4.152147 -0.724 0.469
## x10_catN 0.957184 0.034815 27.493 < 2e-16 ***
## n19_catN -0.138959 0.072070 -1.928 0.054 .
## n23_catN 0.003044 0.068898 0.044 0.965
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.145 on 1983 degrees of freedom
## Multiple R-squared: 0.7067, Adjusted R-squared: 0.7043
## F-statistic: 298.6 on 16 and 1983 DF, p-value: < 2.2e-16
##
## [1] "train rmse 1.14013608960818"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1625]
## [1] "test rmse 1.06117068380281"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1778]
sSigma = 100
print(paste('effects model, smoothing=',sSigma))
## [1] "effects model, smoothing= 100"
bCoder <- trainEffectCoderLSmooth(dTrain,yName,vars,sSigma)
dTrainB <- bCoder$codeFrameR(dTrain)
dTestB <- bCoder$codeFrameR(dTest)
varsB <- setdiff(colnames(dTrainB),yVars)
formulaB <- paste(yName,paste(varsB,collapse=' + '),sep=' ~ ')
modelB <- lm(formulaB,data=dTrainB)
print(summary(modelB))
##
## Call:
## lm(formula = formulaB, data = dTrainB)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0123 -0.4071 0.0243 0.4038 2.0010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03157 0.01411 2.238 0.0253 *
## x1 0.46900 0.03433 13.661 < 2e-16 ***
## x2 0.43635 0.05605 7.785 1.12e-14 ***
## x3 0.35879 0.08629 4.158 3.35e-05 ***
## x4 0.48251 0.04054 11.902 < 2e-16 ***
## x5 0.49399 0.03994 12.369 < 2e-16 ***
## x6 0.36647 0.06119 5.989 2.50e-09 ***
## x7 0.45372 0.03566 12.725 < 2e-16 ***
## x8 0.51743 0.03583 14.443 < 2e-16 ***
## x9 0.44220 0.03236 13.667 < 2e-16 ***
## x10 0.46038 0.02866 16.066 < 2e-16 ***
## n1 2.50847 0.35889 6.990 3.77e-12 ***
## n2 2.48761 0.36780 6.764 1.77e-11 ***
## n3 3.15687 0.35059 9.004 < 2e-16 ***
## n4 2.56514 0.36398 7.048 2.51e-12 ***
## n5 1.88568 0.36896 5.111 3.52e-07 ***
## n6 2.19744 0.37914 5.796 7.90e-09 ***
## n7 2.08384 0.35166 5.926 3.66e-09 ***
## n8 2.36660 0.34844 6.792 1.46e-11 ***
## n9 2.46809 0.35716 6.910 6.51e-12 ***
## n10 2.20245 0.35444 6.214 6.30e-10 ***
## n11 2.68048 0.34446 7.782 1.15e-14 ***
## n12 1.99004 0.37429 5.317 1.18e-07 ***
## n13 2.61391 0.37424 6.984 3.90e-12 ***
## n14 2.43482 0.35298 6.898 7.09e-12 ***
## n15 2.57561 0.36112 7.132 1.38e-12 ***
## n16 2.43798 0.34197 7.129 1.41e-12 ***
## n17 2.25436 0.34132 6.605 5.11e-11 ***
## n18 2.52469 0.35400 7.132 1.39e-12 ***
## n19 2.50887 0.37768 6.643 3.97e-11 ***
## n20 2.24130 0.34401 6.515 9.21e-11 ***
## n21 2.25363 0.36182 6.229 5.75e-10 ***
## n22 2.29114 0.33994 6.740 2.08e-11 ***
## n23 1.94294 0.33639 5.776 8.88e-09 ***
## n24 2.42563 0.35719 6.791 1.47e-11 ***
## n25 1.98204 0.35585 5.570 2.90e-08 ***
## n26 2.16032 0.33883 6.376 2.26e-10 ***
## n27 2.35022 0.37638 6.244 5.21e-10 ***
## n28 2.07056 0.34846 5.942 3.32e-09 ***
## n29 2.03861 0.35614 5.724 1.20e-08 ***
## n30 2.27701 0.34654 6.571 6.40e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6266 on 1959 degrees of freedom
## Multiple R-squared: 0.9132, Adjusted R-squared: 0.9115
## F-statistic: 515.5 on 40 and 1959 DF, p-value: < 2.2e-16
dTrainB$pred <- predict(modelB,newdata=dTrainB)
print(paste('train rmse',rmse(dTrainB$pred,dTrainB[[yName]])))
## [1] "train rmse 0.620113653647223"
print(WVPlots::ScatterHist(dTrainB,'pred',yName,
paste('effects model train, smoothing=',sSigma),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1955]
dTestB$pred <- predict(modelB,newdata=dTestB)
print(paste('test rmse',rmse(dTestB$pred,dTestB[[yName]])))
## [1] "test rmse 1.6938494493661"
print(WVPlots::ScatterHist(dTestB,'pred',yName,
paste('effects model test, smoothing=',sSigma),
smoothmethod='lm',annot_size=2))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.2108]
if(!is.null(cl)) {
parallel::stopCluster(cl)
cl <- NULL
}